The vehicle may be viewed from one of many different angles.
• All the features are numeric, i.e. geometric features extracted from the silhouette.
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#from sklearn.preprocessing import StandardScaler,LabelEncoder
#from scipy import stats
%matplotlib inline
sns.set_style('darkgrid')
%matplotlib inline
#from sklearn.preprocessing import MinMaxScaler
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
#from sklearn.linear_model import LogisticRegression
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn import model_selection
import warnings
warnings.filterwarnings("ignore")
#import plotly
#plotly.offline.init_notebook_mode()
#import plotly.graph_objs as go
#import plotly.tools as tls
#import plotly.figure_factory as ff
#from imblearn.over_sampling import RandomOverSampler
#from imblearn.over_sampling import SMOTENC
#from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
#from sklearn.ensemble import BaggingClassifier
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.ensemble import GradientBoostingClassifier
#from sklearn.ensemble import AdaBoostClassifier
#from statistics import mean
#from tkinter import *
#import tkinter as tk
#from tkinter import ttk
#import pickle
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
os.chdir('C:\\Users\\VGopalak\\Desktop\\ML\\un-supervised learning')
df=pd.read_csv('Part3 - vehicle.csv')
# ---- Data Exploration ----
df.head()  # sample first 5 records
# All details related to the vehicle are given.
df.shape  # (rows, columns) of the dataset
# We have 846 rows and 19 columns.
df.columns  # column names
df.info()  # dtypes and non-null counts
# ---- Missing value check ----
df.isnull().sum()
# We have very few missing values.
# ---- Missing Value Treatment ----
# Fill missing values with the column median (robust to outliers).
# Compute the null counts once instead of re-running df.isnull().sum()
# for every column inside the loop.
null_counts = df.isnull().sum()
null_col = [c for c in df.columns if null_counts[c] > 0]
for c in null_col:
    df[c].fillna(df[c].median(), inplace=True)
#print(null_col)
df.dtypes
# We have int, float and object dtypes.
df.describe().T
# Some variables have a high maximum compared to the 75th percentile.
# The variables are on different scales, so scaling will be needed later.
# ---- Outlier Treatment ----
# Replace values outside 1.5*IQR with the mean computed from the
# in-range values only (mean without outliers).
for c in df.select_dtypes(include=['int64', 'float64']).columns:
    # upper/lower quartiles and the IQR fences
    q25, q75 = np.percentile(df[c], 25), np.percentile(df[c], 75)
    iqr = q75 - q25
    threshold = 1.5 * iqr
    lower, upper = q25 - threshold, q75 + threshold
    # mean of the column ignoring outliers, truncated to int as before
    in_range = df.loc[(df[c] >= lower) & (df[c] <= upper)]
    fill_mean = int(in_range[c].mean())
    # impute outliers on both sides with that mean
    df[c] = np.where(df[c] > upper, fill_mean, df[c])
    df[c] = np.where(df[c] < lower, fill_mean, df[c])
# ---- Distribution of numerical variables ----
outlier_col = []
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    f, axes = plt.subplots(1, 2, figsize=(17, 7))
    sns.boxplot(x=col, data=df, orient='h', ax=axes[1])
    sns.distplot(df[col], ax=axes[0])
    axes[0].set_title('Distribution plot of {}'.format(col))
    axes[1].set_title('Box plot of {}'.format(col))
    plt.show()
    # count outliers remaining after the treatment above
    q25, q75 = np.percentile(df[col], 25), np.percentile(df[col], 75)
    iqr = q75 - q25
    threshold = 1.5 * iqr
    lower, upper = q25 - threshold, q75 + threshold
    # use a distinct comprehension variable; the original reused the loop
    # variable name, shadowing the column name
    outliers = [v for v in df[col] if v < lower or v > upper]
    if len(outliers) > 0:
        outlier_col.append(col)
    print('{} Total Number of outliers in {}: {}'.format('\033[1m', col, len(outliers)))
# ---- Correlation between numerical variables ----
plt.figure(figsize=(15, 8))
# center=0 puts the diverging colormap's midpoint at zero correlation
# (the original used center=1, which skews the color scale).
sns.heatmap(df.corr(), annot=True, linewidths=.5, fmt='.1f', center=0)
plt.show()
# ---- Distribution of the target variable ----
df['class'].value_counts()
f, axes = plt.subplots(1, 2, figsize=(17, 7))
df['class'].value_counts().plot.pie(autopct='%1.1f%%', ax=axes[0])
sns.countplot(x='class', data=df, ax=axes[1], order=['car', 'bus', 'van'])
axes[0].set_title('Class Variable Pie Chart')
axes[1].set_title('Class Variable Bar Graph')
plt.show()
# ---- Independent variables vs dependent variable ----
# Mean of each numeric column with respect to the target class.
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    df.groupby(by=['class'])[col].mean().reset_index().sort_values([col]).tail(10).plot(
        x='class', y=col, kind='bar', figsize=(15, 5))
    plt.show()
sns.pairplot(df, diag_kind='kde', diag_kws={'bw': 0.2})
df.dtypes
df['class'].value_counts()
# Encode the target labels as integers and mark the column categorical.
df['class'].replace({'car': 0, 'bus': 1, 'van': 2}, inplace=True)
df['class'] = df['class'].astype('category')
# ---- Splitting the dataset ----
X = df.drop('class', axis=1)
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
# ---- Grid Search CV ----
# Parameter grid for the RBF-kernel SVM.
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
grid = GridSearchCV(SVC(random_state=1), param_grid, refit=True)
# fit the grid search on the training split
grid.fit(X_train, y_train)
print(grid.best_params_)
# Refit an SVC with the best parameters found by the grid search.
# NOTE(review): X, y and the train/test split were re-derived here with the
# same random_state, producing identical objects - the duplication is removed
# and the existing split is reused.
svc = SVC(C=100, gamma=0.0001, kernel='rbf')
svc.fit(X_train, y_train)
predictions = svc.predict(X_test)
print(classification_report(y_test, predictions))
print("Accuracy on training set: {:.3f}".format(svc.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(svc.score(X_test, y_test)))
We will train the model over multiple folds and report the mean score.
def score_model(model, params, cv, features=None, labels=None):
    """Cross-validate an estimator and return its mean test-fold accuracy.

    Parameters
    ----------
    model : estimator class (e.g. ``SVC``); instantiated as ``model(**params)``.
    params : dict of keyword arguments for the estimator.
    cv : cross-validator exposing ``split(features, labels)``.
    features : DataFrame of predictors; defaults to the module-level ``X``
        (backward compatible with the original global-based behaviour).
    labels : Series of targets; defaults to the module-level ``y``.

    Returns
    -------
    Mean accuracy over the test folds; the mean train/test scores are printed.
    """
    if features is None:
        features = X
    if labels is None:
        labels = y
    train_scores = []
    test_scores = []
    for train_idx, test_idx in cv.split(features, labels):
        X_tr, X_te = features.iloc[train_idx], features.iloc[test_idx]
        y_tr, y_te = labels.iloc[train_idx], labels.iloc[test_idx]
        clf = model(**params).fit(X_tr, y_tr)
        # the original also computed predictions here, but never used them
        train_scores.append(clf.score(X_tr, y_tr))
        test_scores.append(clf.score(X_te, y_te))
    print('Train_score:', np.mean(np.array(train_scores)))
    print('Test_score:', np.mean(np.array(test_scores)))
    return np.mean(np.array(test_scores))
# Evaluate with stratified 10-fold cross-validation.
# shuffle=True is required for random_state to take effect; recent
# scikit-learn raises ValueError when random_state is set without it.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
params = {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
b = score_model(SVC, params, cv)
# ~99% mean train accuracy and ~98% mean test accuracy - a good model.
# Next, use PCA to reduce the number of dimensions.
df.shape
# Initially there are 18 independent columns and one dependent column.
# ---- Covariance Matrix ----
covMatrix = np.cov(X, rowvar=False)
print(covMatrix.shape)
# Covariance indicates the level to which two variables vary together.
# ---- Building PCA ----
# Scale the data first: PCA is sensitive to variable scale.
li = list(df.select_dtypes(include=['int64', 'float64']).columns)
print(li)
X[li] = X[li].apply(zscore)  # replace each continuous value with its z-score
pca = PCA(n_components=18)  # start with all 18 components
pca.fit(X)
# ---- Eigen values ----
print(pca.explained_variance_)
# One eigen value per principal component; each gives the maximum
# variance explained by that component.
# ---- Eigen vectors ----
print((pca.components_.shape))
# 18 principal components have been built.
# ---- Explained variance ratio ----
print(pca.explained_variance_ratio_)
plt.bar(list(range(1, 19)), pca.explained_variance_ratio_, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
plt.plot(np.cumsum(pca.explained_variance_ratio_))
print(np.cumsum(pca.explained_variance_ratio_))
# The first 8 components explain ~97% of the variance in the data.
# ---- Dimensionality Reduction ----
# 8 dimensions are reasonable: over 97% of the original variation retained.
# Project the data onto the first 8 principal components.
pca4 = PCA(n_components=8)
pca4.fit(X)
Xpca4 = pca4.transform(X)
sns.pairplot(pd.DataFrame(Xpca4))
# The components are uncorrelated with each other because principal
# components are mutually orthogonal.
# ---- Fitting SVM with the reduced attributes ----
X_train, X_test, y_train, y_test = train_test_split(Xpca4, y, test_size=0.30, random_state=1)
svc = SVC(C=1000, random_state=1)
svc.fit(X_train, y_train)
predictions = svc.predict(X_test)
print(classification_report(y_test, predictions))
print("Accuracy on training set: {:.3f}".format(svc.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(svc.score(X_test, y_test)))
# Training accuracy is 1.0 (overfitting); tune hyper-parameters to fix it.
# ---- Grid Search CV on the PCA features ----
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
grid = GridSearchCV(SVC(random_state=1), param_grid, refit=True)
# fit the grid search on the PCA-reduced training split
grid.fit(X_train, y_train)
print(grid.best_params_)
def score_model_pca(model, params, cv, features=None, labels=None):
    """Cross-validate an estimator on the PCA-reduced features.

    Same contract as ``score_model`` but the default feature matrix is the
    module-level numpy array ``Xpca4`` (indexed positionally, not via .iloc).

    Parameters
    ----------
    model : estimator class; instantiated as ``model(**params)``.
    params : dict of keyword arguments for the estimator.
    cv : cross-validator exposing ``split(features, labels)``.
    features : 2-D numpy array of predictors; defaults to ``Xpca4``.
    labels : Series of targets; defaults to the module-level ``y``.

    Returns
    -------
    Mean accuracy over the test folds; the mean train/test scores are printed.
    """
    if features is None:
        features = Xpca4
    if labels is None:
        labels = y
    train_scores = []
    test_scores = []
    for train_idx, test_idx in cv.split(features, labels):
        X_tr, X_te = features[train_idx], features[test_idx]
        y_tr, y_te = labels.iloc[train_idx], labels.iloc[test_idx]
        clf = model(**params).fit(X_tr, y_tr)
        # the original also computed predictions here, but never used them
        train_scores.append(clf.score(X_tr, y_tr))
        test_scores.append(clf.score(X_te, y_te))
    print('Train_score:', np.mean(np.array(train_scores)))
    print('Test_score:', np.mean(np.array(test_scores)))
    return np.mean(np.array(test_scores))
# Stratified 10-fold CV on the PCA features.
# shuffle=True is required for random_state to take effect; recent
# scikit-learn raises ValueError when random_state is set without it.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
params = {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
a = score_model_pca(SVC, params, cv)
# Train and test scores are balanced.
# Compare the full-attribute SVC with the 8-component PCA SVC.
result = pd.DataFrame({'SVC': ['All attributes', '8 Principle components'],
                       'Accuracy': [b, a]})
result